{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/v-zhukaijie/anaconda3/envs/promptbench/lib/python3.8/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    }
   ],
   "source": [
    "import promptbench as pb\n",
    "from promptbench.models import LLMModel\n",
    "from promptbench.dyval import *"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['arithmetic', 'linear_equation', 'bool_logic', 'deductive_logic', 'abductive_logic', 'reachability', 'max_sum_path']\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  0%|          | 0/500 [00:00<?, ?it/s][nltk_data] Downloading package words to\n",
      "[nltk_data]     /home/v-zhukaijie/nltk_data...\n",
      "[nltk_data]   Package words is already up-to-date!\n",
      "100%|██████████| 500/500 [00:58<00:00,  8.59it/s]\n"
     ]
    }
   ],
   "source": [
    "# DyVal setup: list the available dataset names, then select one and its prompts.\n",
    "print(pb.dyval.DYVAL_DATASETS)\n",
    "dataset_name = \"arithmetic\"\n",
    "prompts = DYVAL_PROMPTS[dataset_name]\n",
    "\n",
    "# Generation settings for the dataset (the model itself is loaded in a later cell).\n",
    "dataset_config = dict(\n",
    "    is_trainset=False,  # if True, generate a training dataset that includes inference steps for each problem\n",
    "    num_samples=500,  # number of samples to generate\n",
    "    num_nodes_per_sample=10,  # nodes per sample, used in general DAG generation\n",
    "    min_links_per_node=1,  # minimum links per node, used in general DAG generation\n",
    "    max_links_per_node=4,  # maximum links per node, used in general DAG generation\n",
    "    depth=3,  # depth of the DAG, used in tree DAG generation\n",
    "    num_children_per_node=2,  # children per node, used in tree DAG generation\n",
    "    extra_links_per_node=0,  # controls whether extra links are added per node to increase complexity\n",
    "    add_rand_desc=0,  # controls whether random descriptions are added to the nodes to increase complexity\n",
    "    delete_desc=0,  # controls whether node descriptions are deleted; if so, the problem becomes unsolvable because some nodes are undescribed\n",
    "    add_cycles=0,  # controls whether cycles are added to the DAG; if so, the problem becomes unsolvable because of the loops\n",
    ")\n",
    "dataset = DyValDataset(dataset_name, **dataset_config)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'answers': 48.0,\n",
       " 'vars': 'aag',\n",
       " 'descriptions': 'The value of aae is 10.\\nThe value of aab is 10.\\nThe value of aaa is 8.\\naac gets its value by dividing the value of aaa by those of aab.\\nThe value of aad is 6.\\naaf gets its value by multiplying together the value of aad and aae.\\naag gets its value by multiplying together the value of aac and aaf.'}"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Pull out the first sample generated for the \"topological\" order and display it.\n",
    "first_sample = dataset[\"topological\"][0]\n",
    "first_sample"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['google/flan-t5-large', 'llama2-7b', 'llama2-7b-chat', 'llama2-13b', 'llama2-13b-chat', 'llama2-70b', 'llama2-70b-chat', 'phi-1.5', 'gpt-3.5-turbo', 'gpt-4', 'gpt-4-1106-preview', 'gpt-3.5-turbo-1106', 'vicuna-7b', 'vicuna-13b', 'vicuna-13b-v1.3', 'google/flan-ul2']\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565\n",
      "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
     ]
    }
   ],
   "source": [
    "# List the supported model names, then load the one used in this demo.\n",
    "print(pb.SUPPORTED_MODELS)\n",
    "\n",
    "model_kwargs = {\n",
    "    \"max_new_tokens\": 10,\n",
    "    \"temperature\": 0,\n",
    "    \"model_dir\": None,\n",
    "    \"openai_key\": None,\n",
    "    \"sleep_time\": 3,\n",
    "}\n",
    "model = LLMModel(\"google/flan-t5-large\", **model_kwargs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/v-zhukaijie/anaconda3/envs/promptbench/lib/python3.8/site-packages/transformers/generation/configuration_utils.py:381: UserWarning: `do_sample` is set to `False`. However, `temperature` is set to `0` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `temperature`.\n",
      "  warnings.warn(\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2023-11-29 05:05:39.043831: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA\n",
      "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Input: Here is a description of an arithmetic problem:\n",
      "The value of aae is 10.\n",
      "The value of aab is 10.\n",
      "The value of aaa is 8.\n",
      "aac gets its value by dividing the value of aaa by those of aab.\n",
      "The value of aad is 6.\n",
      "aaf gets its value by multiplying together the value of aad and aae.\n",
      "aag gets its value by multiplying together the value of aac and aaf.\n",
      "Compute the result of aag. If the solution cannot be calculated, answer 'N/A'. Ensure your result is within a relative precision of 0.0001 (or 0.01%) compared to the ground truth value. Ensure your final result begins with '<<<' and ends with '>>>', for example, if the answer is 1, your final result should be <<<1>>>.\n",
      "Raw Pred: <pad> Answer: <unk> 1>>></s>\n",
      "Pred: \n",
      "Answer: 48.0\n",
      "\n",
      "\n",
      "{'topological': 0.0}\n"
     ]
    }
   ],
   "source": [
    "# Evaluate every prompt on the dataset and report one score dict per prompt.\n",
    "# `prompt_scores` keeps the result of each prompt, in the same order as `prompts`,\n",
    "# so that no prompt's result is lost when several prompts are evaluated.\n",
    "prompt_scores = []\n",
    "for prompt in prompts:\n",
    "    score = {}\n",
    "\n",
    "    # Available orders of the dataset: \"topological\", \"reversed\", \"random\";\n",
    "    # only the first is evaluated here.\n",
    "    for order in [\"topological\"]:\n",
    "        data = dataset[order]\n",
    "        preds = []\n",
    "        answers = []\n",
    "\n",
    "        # Only the first sample is run; widen the slice to evaluate more samples.\n",
    "        for d in data[:1]:\n",
    "            input_text = pb.InputProcess.basic_format(prompt, d)\n",
    "            raw_pred = model(input_text)\n",
    "\n",
    "            # dyval preds are processed differently, please refer to the source code /promptbench/dyval/dyval_utils.py\n",
    "            pred = process_dyval_preds(raw_pred)\n",
    "            preds.append(pred)\n",
    "            answers.append(d[\"answers\"])\n",
    "\n",
    "            print(f\"Input: {input_text}\")\n",
    "            print(f\"Raw Pred: {raw_pred}\")\n",
    "            print(f\"Pred: {pred}\")\n",
    "            print(f\"Answer: {d['answers']}\")\n",
    "            print(\"\\n\")\n",
    "\n",
    "        score[order] = dyval_evaluate(dataset.dataset_type, preds, answers)\n",
    "\n",
    "    # Report inside the loop: `score` is rebuilt for every prompt, so printing it\n",
    "    # only after the loop would show the last prompt's result alone.\n",
    "    prompt_scores.append(score)\n",
    "    print(score)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "promptbench",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
